/* Copyright (C) 2000-2002 Lavtech.com corp. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
*/

#include "udm_config.h"

#include <stdlib.h>
#include <fcntl.h>
#include <string.h>
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_IO_H
#include <io.h>
#endif
#include <sys/stat.h>
#include <stdio.h>
#include <errno.h>

#include "udm_common.h"
#include "udm_utils.h"
#include "udm_unicode.h"
#include "udm_unidata.h"
#include "udm_searchtool.h"
#include "udm_boolean.h"
#include "udm_xmalloc.h"
#include "udm_spell.h"
#include "udm_stopwords.h"
#include "udm_word.h"
#include "udm_vars.h"
#include "udm_db_int.h"
#include "udm_url.h"
#include "udm_crc32.h"
#include "udm_parsehtml.h"
#include "udm_store.h"
#include "udm_doc.h"
#include "udm_conf.h"

/*
#define DEBUG_CACHE
*/

/*
 * nbits[b] is the number of set bits in the byte value b (0..255).
 * The table is generated two bits at a time: the population count of a
 * value is the count of its top two bits plus the count of the rest.
 */
#define UDM_NBITS2(n) (n), (n)+1, (n)+1, (n)+2
#define UDM_NBITS4(n) UDM_NBITS2(n), UDM_NBITS2((n)+1), UDM_NBITS2((n)+1), UDM_NBITS2((n)+2)
#define UDM_NBITS6(n) UDM_NBITS4(n), UDM_NBITS4((n)+1), UDM_NBITS4((n)+1), UDM_NBITS4((n)+2)
static int nbits[256] = {
  UDM_NBITS6(0), UDM_NBITS6(1), UDM_NBITS6(1), UDM_NBITS6(2)
};
#undef UDM_NBITS6
#undef UDM_NBITS4
#undef UDM_NBITS2

/*
 * Count the set bits in the low 32 bits of v, one byte at a time,
 * using the nbits[] lookup table.
 */
static size_t bit_count(unsigned int v)
{
  size_t total = nbits[v & 0xFF];

  total += nbits[(v >> 8) & 0xFF];
  total += nbits[(v >> 16) & 0xFF];
  total += nbits[(v >> 24) & 0xFF];
  return total;
}

/*
 * Count the set bits in the low 64 bits of v.
 * Eight bytes are always examined; when unsigned long is narrower the
 * extra rounds see zero and add nothing.
 */
static size_t bit_count_long(unsigned long v) {
  size_t total = 0;
  int bytes_left = 8;

  while (bytes_left-- > 0) {
    total += nbits[v & 0xFF];
    v >>= 8;
  }
  return total;
}


/********** QSORT functions *******************************/

/*
 * qsort() comparator: order by coord descending, then by url_id ascending.
 *
 * The fields are compared explicitly instead of being subtracted: a
 * difference narrowed to int can have the wrong sign when the operands
 * are far apart (coord may carry a value in its top byte), which makes
 * the ordering inconsistent.
 */
static int cmpword(UDM_URL_CRD *s1,UDM_URL_CRD *s2){
	if(s1->coord!=s2->coord)
		return (s2->coord>s1->coord)?1:-1;
	if(s1->url_id!=s2->url_id)
		return (s1->url_id>s2->url_id)?1:-1;
	return 0;
}

/*
 * qsort() comparator: order by url_id ascending, then by the
 * UDM_WRDPOS() part of coord ascending.
 *
 * Explicit comparisons are used rather than subtraction so that the
 * result cannot overflow or change sign for distant url_id values.
 */
static int cmpurlid(UDM_URL_CRD *s1,UDM_URL_CRD *s2){
	if(s1->url_id!=s2->url_id)
		return (s1->url_id>s2->url_id)?1:-1;
	if(UDM_WRDPOS(s1->coord)!=UDM_WRDPOS(s2->coord))
		return (UDM_WRDPOS(s1->coord)>UDM_WRDPOS(s2->coord))?1:-1;
	return 0;
}

/****************************************************/

/* Sort num entries of wrd in place with cmpword (coord descending, url_id ascending). */
void UdmSortSearchWordsByWeight(UDM_URL_CRD *wrd,size_t num){
	qsort(wrd,num,sizeof(UDM_URL_CRD),(qsort_cmp)cmpword);
}

/* Sort num entries of wrd in place with cmpurlid (url_id ascending, then word position). */
void UdmSortSearchWordsByURL(UDM_URL_CRD *wrd,size_t num){
	qsort(wrd,num,sizeof(UDM_URL_CRD),(qsort_cmp)cmpurlid);
}


/* Find topcount best results */
void UdmWrdTopSort(UDM_URL_CRD *wrd, size_t nwrd,size_t topcount){
	size_t j;
	UDM_URL_CRD w;
	
#ifdef DEBUG_TOP_SORT
	fprintf(stderr,"top+1=%d nwrd=%d\n",topcount+1,nwrd);
#endif
	
	UdmSortSearchWordsByWeight(wrd,topcount+1);
	
	for(j=topcount;j<nwrd;j++){
		register int res;
		if(!(res=(wrd[j].coord-wrd[topcount].coord)))
		if(!(res=(wrd[topcount].url_id-wrd[j].url_id)));
#ifdef DEBUG_TOP_SORT
fprintf(stderr,"(%d,%d) %d (%d,%d) %d\n",
	wrd[topcount].coord,wrd[topcount].url_id,topcount,
	wrd[j].cooord,wrd[j].url_id,j);
#endif
		if(res>0){
			size_t l,c,r;
			
			l=0;r=topcount;
			while(l<r){
				c=(l+r)/2;
				if(!(res=(wrd[c].coord-wrd[j].coord)))
				if(!(res=(wrd[j].url_id-wrd[c].url_id)));
				if(res>0){
					l=c+1;
				}else{
					r=c;
				}
			}
			w=wrd[topcount];
			memmove(&wrd[r+1],&wrd[r],(topcount-r)*sizeof(*wrd));
			wrd[r]=wrd[j];
			wrd[j]=w;
		}
	}
}

/*
 * Parse the search query (variable "q" of query->Conf->Vars) into Res.
 *
 * Res is zeroed first.  The query is converted from the browser charset
 * to the internal Unicode form, lower-cased and split into tokens:
 *   - separator tokens starting with one of "&|~+-()" contribute boolean
 *     operators to Res->items;
 *   - letter tokens contribute a UDM_STACK_WORD item and are added to
 *     Res->WWList, either as a stop word (full-word match mode only) or
 *     as a query word together with the forms returned by UdmAllForms().
 * A copy of the query in the local charset is stored in variable "q-lc".
 *
 * Returns 0 on success (and also when the "sys-int" charset is not
 * available, as before); returns 1 when memory cannot be allocated.
 */
int UdmPrepare(UDM_AGENT * query,UDM_RESULT *Res){
	UDM_CHARSET * browser_cs, * local_cs, *sys_int;
	int ctype;
	int * ustr, * lt, * lex;
	size_t ulen, llen, lend;
	int word_match   = UdmVarListFindInt(&query->Conf->Vars,"wm",UDM_MATCH_FULL);
	const char * txt = UdmVarListFindStr(&query->Conf->Vars,"q","");
	char *ltxt;
	char wrd[UDM_MAXWORDSIZE*7+1], *clex;
	int uwrd[UDM_MAXWORDSIZE+1];
	UDM_CONV uni_lc, bc_uni;
	int stack_full=0;
	
	bzero(Res,sizeof(*Res));
	
	if (!(browser_cs = query->Conf->bcs)) {
		browser_cs=UdmGetCharSet("iso-8859-1");
	}
	if(!(local_cs = query->Conf->lcs)) {
		local_cs=UdmGetCharSet("iso-8859-1");
	}
	
	if (!(sys_int=UdmGetCharSet("sys-int")))
		return 0;
	
	UdmConvInit(&bc_uni,browser_cs,sys_int,0);
	UdmConvInit(&uni_lc,sys_int,local_cs,UDM_RECODE_HTML);
	
	ulen=strlen(txt);
	ustr=(int*)malloc(sizeof(int)*(ulen+1));
	if(!ustr)
		return 1;
	UdmConv(&bc_uni,(char*)ustr,sizeof(ustr[0])*(ulen+1),txt,ulen+1);
	
	/* Create copy of query, converted into LocalCharset (for UdmTrack) */
	llen=ulen*7+1;
	ltxt=(char*)malloc(llen);
	if(!ltxt){
		free(ustr);
		return 1;
	}
	UdmConv(&uni_lc,ltxt,llen,(char*)ustr,bc_uni.obytes);
	/* Terminate after the converted bytes, but never outside the buffer */
	lend=(size_t)uni_lc.obytes;
	if(lend>=llen)lend=llen-1;
	ltxt[lend]='\0';
	UdmVarListReplaceStr(&query->Conf->Vars,"q-lc",ltxt);
	free(ltxt);
	
	/* Parse query and build boolean search stack */
	UdmUniStrToLower(ustr);
	lex = UdmUniGetSepToken(ustr, &lt , &ctype);
	while(lex&&!stack_full){
		size_t wlen=lt-lex;
		/* Never copy or convert more than uwrd[] can hold */
		size_t clen=udm_min(wlen, UDM_MAXWORDSIZE);
		
		memcpy(uwrd, lex, clen * sizeof(int));
		uwrd[clen] = 0;
		/* Convert the clamped copy including its terminator */
		UdmConv(&uni_lc,wrd,sizeof(wrd),(char*)uwrd,sizeof(uwrd[0])*(clen+1));
		clex = UdmTrim(wrd, " \t\r\n");
		
		if (ctype != UDM_UNI_LETTER) {
			if(strchr("&|~+-()", clex[0])) {
				/* Boolean language operators */
				const char *op;
				
				for (op = clex; *op; op++) {
					int cmd;
					
					switch(*op) {
					case '&':
					case '+':
						cmd = UDM_STACK_AND;
						break;
					case '|':
						cmd = UDM_STACK_OR;
						break;
					case '~':
					case '-':
						cmd = UDM_STACK_NOT;
						break;
					case '(':
						cmd = UDM_STACK_LEFT;
						break;
					case ')':
						cmd = UDM_STACK_RIGHT;
						break;
					default:
						/* Blank or other separator inside the token:   */
						/* it is not an operator, so push nothing       */
						/* (previously an item with no command was     */
						/* pushed for every such character)            */
						continue;
					}
					/* UdmGroupByURL() copies the stack into an array of */
					/* UDM_MAXSTACK items, so never collect more than that */
					if (Res->nitems >= UDM_MAXSTACK) {
						stack_full=1;
						break;
					}
					Res->items[Res->nitems].cmd = cmd;
					Res->items[Res->nitems].arg=0;
					Res->nitems++;
				}
			}
		} else {
			int is_stop=0;
			int addwrd;
			
			if (Res->nitems >= UDM_MAXSTACK)
				break;
			
			Res->items[Res->nitems].cmd=UDM_STACK_WORD;
			Res->items[Res->nitems].arg = 1L << (Res->WWList.nuniq);
			Res->nitems++;
			
			if(word_match==UDM_MATCH_FULL){
				/* Check stopword only when full word         */
				/* Substring searches should not exclude them */
				is_stop=(UdmStopListFind(&query->Conf->StopWords,wrd)||
						(query->Conf->WordParam.min_word_len>wlen)||
						(query->Conf->WordParam.max_word_len<wlen));
			}
			/* A real query word is only accepted while there is room for it */
			addwrd=(!is_stop)&&(Res->WWList.nuniq < UDM_MAXWORDPERQUERY-1);
			
			if(is_stop||addwrd){
				UDM_WIDEWORD OWord;
				
				OWord.len=strlen(wrd);
				OWord.order=Res->WWList.nuniq;
				OWord.count=0;
				OWord.crcword=UdmStrCRC32(wrd);
				OWord.word=wrd;
				OWord.uword=uwrd;
				OWord.origin = is_stop ? UDM_WORD_ORIGIN_STOP : UDM_WORD_ORIGIN_QUERY;
				UdmWideWordListAdd(&Res->WWList, &OWord, 1);
				
				if(addwrd){
					UDM_WIDEWORDLIST * forms;
					
					if((forms=UdmAllForms(query,&OWord))){
						UDM_WIDEWORD FWord;
						size_t frm;
						
						for(frm=0;frm<forms->nwords;frm++){
							UdmConv(&uni_lc,wrd,sizeof(wrd),
								(char*)(forms->Word[frm].uword),
								sizeof(forms->Word[frm].uword[0])*(UdmUniLen(forms->Word[frm].uword)+1));
							FWord.len=strlen(wrd);
							FWord.order=Res->WWList.nuniq;
							FWord.count=0;
							FWord.crcword=UdmStrCRC32(wrd);
							FWord.word=wrd;
							FWord.uword=forms->Word[frm].uword;
							FWord.origin = forms->Word[frm].origin;
							
							UdmWideWordListAdd(&Res->WWList,&FWord, 1);
						}
						UdmWideWordListFree(forms);
						free(forms);
					}
					Res->WWList.nuniq++;
				}
			}
		}
		lex = UdmUniGetSepToken(NULL, &lt, &ctype);
	}
	free(ustr);
	
	return(0);
}

/*
 * Compute the ranking value of one document from its phrase table.
 *
 * phr[k] is the accumulated weight of the (sub)phrases that contain k+1
 * distinct query words; words_in_query is the number of entries used.
 *
 * In UDM_MODE_PHRASE the value is phr[words_in_query-1].  Otherwise it is
 * packed as
 *     bits 24..31  phr[words_in_query-1], clamped to 0xFF
 *                  (the clamp is also written back to phr[])
 *     bits 16..23  uniq_words, clamped to 0xFF so it cannot spill into
 *                  the field above
 *     bits  0..15  sum of phr[i]*(i+1) over the shorter phrases,
 *                  clamped to 0xFFFF
 *
 * url_id and weight are accepted for the callers' convenience but are
 * not used.  Returns 0 when there is no phrase table or no query word
 * (words_in_query-1 would otherwise wrap around and index out of bounds).
 */
static size_t UdmCalcPhraseWeight(int url_id,size_t weight,size_t uniq_words,size_t *phr,size_t words_in_query,int search_mode){
	size_t res=0;
	size_t last;
	size_t i;

	(void)url_id;
	(void)weight;

	if(!phr||!words_in_query)return 0;
	last=words_in_query-1;

	if(search_mode==UDM_MODE_PHRASE)
		return phr[last];

	for(i=0;i<last;i++){
		res+=phr[i]*(i+1);
	}
	if(res>0xFFFF)res=0xFFFF;
	if(phr[last]>0xFF)phr[last]=0xFF;
	if(uniq_words>0xFF)uniq_words=0xFF;

	return (size_t)(phr[last]<<24)|(size_t)(uniq_words<<16)|res;
}

void UdmGroupByURL(UDM_AGENT *query,UDM_RESULT *Res){	
	UDM_STACK_ITEM temp_items[UDM_MAXSTACK];
	size_t	i,j=0,Doc_weight,item,*Doc_phr,phr_size;
	uint4	phr_beg,phr_mask,phr_weight;
	unsigned long    count;
	int search_mode = UdmSearchMode(UdmVarListFindStr(&query->Conf->Vars, "m", "all"));
	UDM_URL_CRD *Crd = Res->CoordList.Coords;
	
	if(!Res->CoordList.ncoords)return;

	Doc_weight=UDM_WRDSEC(Res->CoordList.Coords[0].coord);
	count=UDM_WRDMASK(Res->CoordList.Coords[0].coord);
	phr_size=Res->WWList.nwords*sizeof(size_t);
	phr_beg=UDM_WRDPOS(Res->CoordList.Coords[0].coord);
	phr_mask=UDM_WRDMASK(Res->CoordList.Coords[0].coord);
	phr_weight=UDM_WRDSEC(Res->CoordList.Coords[0].coord);

	if(search_mode==UDM_MODE_BOOL){
		memcpy(temp_items,Res->items,Res->nitems*sizeof(UDM_STACK_ITEM));
	}
	Doc_phr=(size_t*)malloc(phr_size);
	bzero(Doc_phr, phr_size);

	for(i=1;i<Res->CoordList.ncoords;i++){
		/* Group by url_id */
		if(Crd[j].url_id==Crd[i].url_id){
		/* Same document */
			count|=UDM_WRDMASK(Res->CoordList.Coords[i].coord);
			
			Doc_weight+=UDM_WRDSEC(Res->CoordList.Coords[i].coord);
			/* Calculate phrase (subphrase) summary weight */

			if((UDM_WRDPOS(Crd[i].coord)-phr_beg)<=((bit_count(phr_mask)+1))){
				phr_mask|=UDM_WRDMASK(Crd[i].coord);
				phr_weight+=UDM_WRDSEC(Crd[i].coord);
			}else{
				Doc_phr[bit_count(phr_mask)-1]+=phr_weight;
				phr_beg=UDM_WRDPOS(Crd[i].coord);
				phr_mask=UDM_WRDMASK(Crd[i].coord);
				phr_weight=UDM_WRDSEC(Crd[i].coord);
			}
		}else{
			/* Next document */
			Doc_phr[bit_count(phr_mask)-1]+=phr_weight;

			if(search_mode==UDM_MODE_BOOL){
				/* Create a copy of lexem array       */
				/* and change word masks by 1 or 0    */
				/* depending on whether word presents */
				/* in the search query being executed */
				for(item=0;item<Res->nitems;item++){
					if(temp_items[item].cmd==UDM_STACK_WORD){
						/* Change word mask by 1 or 0 */
						temp_items[item].arg=((Res->items[item].arg)&(count))?1:0;
					}
				}
				if(UdmCalcBoolItems(temp_items,Res->nitems)){
					Crd[j].coord=UdmCalcPhraseWeight(Crd[j].url_id,Doc_weight,bit_count_long(count),Doc_phr,Res->WWList.nuniq,search_mode);
					j++;
				}else{
					/* Skip this result */
				}
			}else
			if((search_mode==UDM_MODE_ALL)&&(bit_count(count)<Res->WWList.nuniq)){
				/* Skip this result */
			}else
			if((search_mode==UDM_MODE_PHRASE)&&(Doc_phr[Res->WWList.nwords-1]==0)){
				/* Skip this result */
			}else{
				Crd[j].coord=UdmCalcPhraseWeight(Crd[j].url_id,Doc_weight,bit_count(count),Doc_phr,Res->WWList.nuniq,search_mode);
				j++;
			}
			Doc_weight=UDM_WRDSEC(Crd[i].coord);
			phr_mask=UDM_WRDMASK(Crd[i].coord);
			phr_beg=UDM_WRDPOS(Crd[i].coord);
			phr_weight=UDM_WRDSEC(Crd[i].coord);
			count=UDM_WRDMASK(Crd[i].coord);
			bzero(Doc_phr, phr_size);
			Crd[j]=Crd[i];
		}
	}

	/* Check last word */
	Doc_phr[bit_count(phr_mask)-1]+=phr_weight;
	
	switch(search_mode){
		case UDM_MODE_BOOL:
			for(item=0;item<Res->nitems;item++){
				if(temp_items[item].cmd==UDM_STACK_WORD){
					/* Change word mask by 1 or 0 */
					temp_items[item].arg=((Res->items[item].arg)&(count))?1:0;
				}
			}
			Res->CoordList.ncoords=(UdmCalcBoolItems(temp_items,Res->nitems))?j+1:j;
			break;
		case UDM_MODE_PHRASE:
			Res->CoordList.ncoords=(Doc_phr[Res->WWList.nuniq-1]>0)?j+1:j;
			break;
		case UDM_MODE_ALL:
			Res->CoordList.ncoords=(bit_count(count)>=Res->WWList.nuniq)?j+1:j;
			break;
		case UDM_MODE_ANY:
		default:
			Res->CoordList.ncoords=j+1;
			break;
	}
	Crd[j].coord=UdmCalcPhraseWeight(Crd[j].url_id,Doc_weight,count,Doc_phr,Res->WWList.nuniq,search_mode);
	free(Doc_phr);
	return;
}

/*
 * Pair of 32-bit counters describing a phrase.
 * NOTE(review): this type is not referenced anywhere in the visible part
 * of this file; it presumably belongs to an older phrase-weight
 * calculation -- confirm before removing.
 */
typedef struct{
	/*char	phr_len;*/
	uint4	count;	/* presumably number of occurrences -- TODO confirm */
	uint4	weight;	/* presumably accumulated weight -- TODO confirm */
} UDM_PHR_PAR;



/*
 * Split a CGI query string ("name=value&name=value...") and store every
 * pair in vars; values are unescaped with UdmUnescapeCGIQuery().
 *
 * If vars also holds a variable "Limit-<name>" of the form
 * "<type>:<field>", a search limit is registered for a non-empty value
 * through UdmAddSearchLimit().  Recognised types: "nested" and "cat"
 * (0), "time" (1), "linear" (2), "host" and "tag" (3); any other type
 * is treated as 0.  The limit receives the raw (still escaped) value.
 *
 * query_string is modified in place (it is tokenised with strtok_r).
 *
 * Returns 0 on success (a NULL query_string is treated as empty) and 1
 * when memory cannot be allocated.
 */
int UdmParseQueryString(UDM_AGENT * Agent,UDM_VARLIST * vars,char * query_string){
	char * tok, *lt;
	size_t len;
	char *str;

	if (query_string == NULL) return 0;

	/* Large enough for any unescaped value and for "Limit-" + name + NUL */
	len = strlen(query_string);
	str = (char *)malloc(len + 7);
	if (str == NULL) return 1;
	
	tok=strtok_r(query_string,"&",&lt);
	while(tok){
		char empty[]="";
		char * val;
		const char * lim;
		
		if((val=strchr(tok,'='))){
			*val='\0';
			val++;
		}else{
			val=empty;
		}
		UdmUnescapeCGIQuery(str,val);
		UdmVarListReplaceStr(vars,tok,str);
		
		/* tok is part of query_string, so this fits into len+7 bytes */
		sprintf(str,"Limit-%s",tok);
		if((lim=UdmVarListFindStr(vars,str,NULL))){
			int ltype=0;
			char * type, * fname, * llt;
			/* The limit definition comes from the configuration and may  */
			/* be longer than the query string, so it gets its own buffer */
			/* (strncpy into str could truncate it and leave it           */
			/* unterminated)                                              */
			size_t limlen=strlen(lim);
			char *limbuf=(char *)malloc(limlen+1);
			
			if(limbuf==NULL){
				UDM_FREE(str);
				return 1;
			}
			memcpy(limbuf,lim,limlen+1);
			
			if((type=strtok_r(limbuf,":",&llt))){
				if(!strcasecmp(type,"nested"))ltype=0;
				if(!strcasecmp(type,"cat")) ltype=0;
				if(!strcasecmp(type,"time"))ltype=1;
				if(!strcasecmp(type,"linear"))ltype=2;
				if(!strcasecmp(type,"host"))ltype=3;
				if(!strcasecmp(type, "tag")) ltype=3;
				if((fname=strtok_r(NULL,":",&llt))){
					if (val[0])
						UdmAddSearchLimit(Agent,ltype,fname,val);
				}
			}
			free(limbuf);
		}
		tok=strtok_r(NULL,"&",&lt);
	}
	
	UDM_FREE(str);
	return 0;
}


/*
 * Convert src from charset lcs to charset bcs (with HTML recoding) and
 * mark every token that equals, ignoring case, one of the words of List:
 * a marked token is preceded by the byte '\2' and followed by '\3'.
 *
 * Tokens are matched on their Unicode form, so the comparison uses the
 * length of the word's Unicode string rather than Word.len, which is
 * filled with the byte length of the converted word elsewhere in this
 * file and differs from the character count for multi-byte charsets.
 *
 * Returns a newly allocated NUL-terminated string that the caller must
 * free, or NULL when src is NULL or empty, when the "sys-int" charset is
 * unavailable, or when memory cannot be allocated.  If the result buffer
 * cannot be enlarged the text converted so far is returned.
 */
char * UdmHlConvert(UDM_WIDEWORDLIST *List,const char * src, UDM_CHARSET * lcs, UDM_CHARSET * bcs) {
	int		*tok, *lt, ctype, *uni;
	char		*hpart, *htxt;
	size_t		len, hcap, tcap, used=0;
	UDM_CONV	lc_uni, uni_bc;
	UDM_CHARSET	*sys_int;
	
	if(!src)return NULL;
	if ((len = strlen(src)) == 0) return NULL;
	if(!(sys_int=UdmGetCharSet("sys-int")))return NULL;
	
	hcap=tcap=len*10+10;
	hpart=(char*)malloc(hcap);
	htxt=(char*)malloc(tcap);
	uni=(int*)malloc((len+10)*sizeof(int));
	if(!hpart||!htxt||!uni){
		free(hpart);
		free(htxt);
		free(uni);
		return NULL;
	}
	htxt[0]='\0';
	
	UdmConvInit(&lc_uni,lcs,sys_int,0);
	UdmConvInit(&uni_bc,sys_int,bcs,UDM_RECODE_HTML);
	
	/* Convert to unicode (the terminator is converted as well) */
	UdmConv(&lc_uni,(char*)uni,sizeof(uni[0])*(len+10),src,len+1);
	
	/* Parse unicode string */
	tok = UdmUniGetSepToken(uni, &lt, &ctype);
	while(tok){
		int	found=0;
		int	euchar;
		size_t	flen=lt-tok;
		size_t	hlen, term, need, uw;
		
		/* Convert token to BrowserCharset */
		euchar=tok[flen];
		tok[flen]=0;
		hpart[0]='\0';
		
		/* One byte is kept back for the terminator */
		UdmConv(&uni_bc,hpart,hcap-1,(char*)tok,sizeof(*tok)*flen);
		
		/* NOTE(review): obytes is taken to be the number of bytes this */
		/* call produced, as UdmPrepare() uses it -- confirm.  The input */
		/* has no terminator, so the output is terminated here.          */
		term=(size_t)uni_bc.obytes;
		if(term>hcap-1)term=hcap-1;
		hpart[term]='\0';
		hlen=strlen(hpart);
		
		tok[flen]=euchar;
		
		/* Check that it is word to be marked */
		if(List){
			for(uw=0;uw<List->nwords;uw++){
				int *uword=List->Word[uw].uword;
				
				if(uword&&(flen==(size_t)UdmUniLen(uword))&&
				   (!UdmUniStrNCaseCmp(tok,uword,flen))){
					found=1;
					break;
				}
			}
		}
		
		/* Append [\2] token [\3], enlarging the result when necessary */
		need=hlen+(found?2:0);
		if(used+need+1>tcap){
			size_t	ncap=(used+need+1)*2;
			char	*grown=(char*)realloc(htxt,ncap);
			
			if(!grown)break;
			htxt=grown;
			tcap=ncap;
		}
		if(found)htxt[used++]='\2';
		memcpy(htxt+used,hpart,hlen);
		used+=hlen;
		if(found)htxt[used++]='\3';
		htxt[used]='\0';
		
		tok = UdmUniGetSepToken(NULL, &lt, &ctype);
	}
	free(hpart);
	free(uni);
	return(htxt);
}

/*
 * Prepare a search result for display in the browser charset.
 *
 * Every word of Res->WWList is converted from lcs to bcs (with HTML
 * recoding) and its old string is freed; a word that is NULL, or whose
 * new buffer cannot be allocated, is left as it is.  Every section value
 * of every document in Res->Doc is replaced by the output of
 * UdmHlConvert(), i.e. converted text with the query words marked; the
 * old value is freed, and the section becomes NULL when UdmHlConvert()
 * returns NULL.
 *
 * Always returns UDM_OK.
 */
int UdmResHlConvert(UDM_RESULT *Res,UDM_CHARSET *lcs,UDM_CHARSET *bcs){
	size_t		i;
	UDM_CONV	lc_bc;
	
	/* Convert word list */
	UdmConvInit(&lc_bc,lcs,bcs,UDM_RECODE_HTML);
	for(i=0;i<Res->WWList.nwords;i++) {
		UDM_WIDEWORD	*W=&Res->WWList.Word[i];
		size_t		len, cap;
		char		*newval;
		
		if(!W->word)continue;
		len=strlen(W->word);
		cap=len*9+1;
		if(!(newval=(char*)malloc(cap)))continue;
		
		/* The terminator is part of the input, so it is converted too */
		UdmConv(&lc_bc,newval,cap,W->word,len+1);
		/* Only has an effect if the output filled the whole buffer */
		newval[cap-1]='\0';
		free(W->word);
		W->word=newval;
	}
	
	/* Convert document sections */
	for(i=0;i<Res->num_rows;i++){
		UDM_DOCUMENT	*D=&Res->Doc[i];
		size_t		sec;
		
		for(sec=0;sec<D->Sections.nvars;sec++){
			UDM_VAR	*Var=&D->Sections.Var[sec];
			char	*newval=UdmHlConvert(&Res->WWList,Var->val,lcs,bcs);
			
			UDM_FREE(Var->val);
			Var->val=newval;
		}
	}
	return UDM_OK;
}

/*
static int * UdmUniStrWWL(int *s, UDM_WIDEWORDLIST *wwl, int *c, size_t *len) {
  int sc;
  register size_t i;

  while((sc = *s++) != 0) {
    for(i = 0; i < wwl->nwords; i++) {
      if ((sc == c[i]) && (wwl->Word[i].origin != UDM_WORD_ORIGIN_STOP)) {
	if (memcmp(s, &(wwl->Word[i].uword[1]), len[i] * sizeof(int)) == 0) {
	  s--;
	  return s;
	}
      }
    }
  }
  return NULL;
}

static int UdmUniNSpace(int c) {
	if (c == 0x0020) return 0;
	if (c == 0x0026) return 0;
	if (c == 0x00A0) return 0;
	if (c == 0x1680) return 0;
	if ((c >= 0x2000) && (c <= 0x200B)) return 0;
	if (c == 0x202F) return 0;
	if (c == 0x3000) return 0;
	return 1;
}


__INDLIB__ char * UdmExcerptDoc(UDM_AGENT *query, UDM_RESULT *Res, UDM_DOCUMENT *Doc, size_t size) {
  char *HDoc,*HEnd;
  const char *htok, *last;
  const char *lcharset;
  UDM_CHARSET *lcs = NULL, *dcs = NULL, *sys_int;
  UDM_HTMLTOK tag;
  int *start, *end, *uni, ures, *p, *oi, dot[] = {0x2e, 0x2e, 0x2e, 0};
  char *os;
  int *c;
  size_t *wlen, i, len;
  UDM_CONV dc_uni, uni_lc;

  if (query->Conf->lcs == NULL) {
    lcharset = UdmVarListFindStr(&query->Conf->Vars, "CS", "");
    if (lcharset == NULL || (!strcmp(lcharset, ""))) {
      lcharset = UdmVarListFindStr(&query->Conf->Vars, "LocalCharset", "iso-8859-1");
    }
    lcs = UdmGetCharSet(lcharset);
  } else {
    lcs = query->Conf->lcs;
  }
  dcs = UdmGetCharSet(UdmVarListFindStr(&Doc->Sections,"Charset","iso-8859-1"));
  
  if (!lcs || !dcs) return NULL;
  if (!(sys_int=UdmGetCharSet("sys-int")))
    return NULL;
  
  UdmConvInit(&dc_uni,dcs,sys_int,0);
  UdmConvInit(&uni_lc,sys_int,lcs,UDM_RECODE_HTML);

  c = (int *) malloc(Res->WWList.nwords * sizeof(int));
  if (c == NULL) { return NULL; }
  wlen = (size_t *) malloc(Res->WWList.nwords * sizeof(size_t));
  if (wlen == NULL) {
    UDM_FREE(c);
    return NULL;
  }
  for (i = 0; i < Res->WWList.nwords; i++) {
    wlen[i] = Res->WWList.Word[i].len - 1;
    c[i] = Res->WWList.Word[i].uword[0];
  }
  if ((oi = (int *)malloc(512 * sizeof(int))) == NULL) {
    UDM_FREE(c); UDM_FREE(wlen);
    return NULL;
  }
  oi[0]=0;

  if(UdmUnStoreDoc(query, Doc)) {
    UDM_FREE(oi); UDM_FREE(c); UDM_FREE(wlen);
    return NULL;
  }
  
  if ((HEnd=HDoc = (char *)malloc(Doc->Buf.size)) == NULL) {
    UDM_FREE(oi); UDM_FREE(c); UDM_FREE(wlen);
    return NULL;
  }
  HDoc[0]='\0';

  if ( (uni = (int *)malloc((Doc->Buf.size + 10) * sizeof(int)) ) == NULL) {
    UDM_FREE(oi); UDM_FREE(c); UDM_FREE(wlen); UDM_FREE(HDoc);
    return NULL;
  }

  UdmHTMLTOKInit(&tag); 
  
  for(htok = UdmHTMLToken(Doc->Buf.content, &last, &tag) ; htok ; )
  {
    switch(tag.type) {
    case UDM_HTML_TXT:
      memcpy(HEnd,htok,(size_t)(last-htok));
      HEnd+= last-htok;
      HEnd[0]='\0';
      break;
    case UDM_HTML_COM:
    case UDM_HTML_TAG:
    default:
      break;
    }
    htok = UdmHTMLToken(NULL, &last, &tag);
  }
  
  if (HEnd==HDoc)return NULL;
  
  len = HEnd-HDoc;
  

  UdmConv(&dc_uni,(char*)uni,sizeof(*uni)*(Doc->Buf.size+10),HDoc,Doc->Buf.size);
  UdmUniStrToLower(uni);
  UDM_FREE(HDoc);

  for(p = uni; ((p = UdmUniStrWWL(p, &(Res->WWList), c, wlen)) != NULL) && (UdmUniLen(oi) < 256) && (p < (uni + len)) ;) {
    if ( ( (p > uni) && (!UdmUniNSpace(*(p-1))) ) || (p == uni)  ) {

      start = max(p - 64, uni);
      end = min(p + 64, uni + len);
      while(UdmUniNSpace(*start) && (start < end)) start++;
      while(UdmUniNSpace(*end) && (start < end)) end--;
      if (start != uni) UdmUniStrCat(oi, dot);
      ures = *end; *end = 0; UdmUniStrCat(oi, start); *end = ures;
      if (end != uni + len) UdmUniStrCat(oi, dot);
      p = end;
    } else p++;
  }

  if ((os = (char *)malloc(5120 * sizeof(char))) == NULL) {
    UDM_FREE(oi);
    UDM_FREE(c);
    UDM_FREE(wlen);
    return NULL;
  }
  
  UdmConv(&uni_lc,os,sizeof(*os)*5119,(char*)oi,sizeof(*oi)*UdmUniLen(oi));
  os[5119]='\0';
  
  {
    register char *cc;
    while ((cc = strchr(os, '\n')) != NULL) {
      *cc = ' ';
    }
    while ((cc = strchr(os, '\r')) != NULL) {
      *cc = ' ';
    }
    while ((cc = strchr(os, '\t')) != NULL) {
      *cc = ' ';
    }
  }
  UDM_FREE(c);
  UDM_FREE(wlen);
  UDM_FREE(oi);
  return os;
}
*/
